# run with ray up ray_k8s_sky.yaml --no-config-cache
# An unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster

# The maximum number of workers nodes to launch in addition to the head
# node.
min_workers: 0
max_workers: 0

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
    type: external
    module: sky.skylet.providers.kubernetes.KubernetesNodeProvider

    # Use False if running from outside of k8s cluster
    use_internal_ips: false

    # Namespace to use for all resources created.
    namespace: default

    # ServiceAccount created by the autoscaler for the head node pod that it
    # runs in. If this field isn't provided, the head pod config below must
    # contain a user-created service account with the proper permissions.
    autoscaler_service_account:
        apiVersion: v1
        kind: ServiceAccount
        metadata:
            labels:
                parent: skypilot
            name: autoscaler

    # Role created by the autoscaler for the head node pod that it runs in.
    # If this field isn't provided, the role referenced in
    # autoscaler_role_binding must exist and have at least these permissions.
    autoscaler_role:
        kind: Role
        apiVersion: rbac.authorization.k8s.io/v1
        metadata:
            labels:
                parent: skypilot
            name: autoscaler
        rules:
        - apiGroups: [""]
          resources: ["pods", "pods/status", "pods/exec"]
          verbs: ["get", "watch", "list", "create", "delete", "patch"]

    # RoleBinding created by the autoscaler for the head node pod that it runs
    # in. If this field isn't provided, the head pod config below must contain
    # a user-created service account with the proper permissions.
    autoscaler_role_binding:
        apiVersion: rbac.authorization.k8s.io/v1
        kind: RoleBinding
        metadata:
            labels:
                parent: skypilot
            name: autoscaler
        subjects:
        - kind: ServiceAccount
          name: autoscaler
        roleRef:
            kind: Role
            name: autoscaler
            apiGroup: rbac.authorization.k8s.io

    services:
      # Service to expose the head node pod's SSH port.
      - apiVersion: v1
        kind: Service
        metadata:
          labels:
            parent: skypilot
          name: example-cluster-head-ssh
        spec:
          type: NodePort
          selector:
            component: example-cluster-head
          ports:
            - protocol: TCP
              port: 22
              targetPort: 22
      # Service that maps to the head node of the Ray cluster.
      - apiVersion: v1
        kind: Service
        metadata:
            labels:
              parent: skypilot
            # NOTE: If you're running multiple Ray clusters with services
            # on one Kubernetes cluster, they must have unique service
            # names.
            name: example-cluster-head
        spec:
            # This selector must match the head node pod's selector below.
            selector:
                component: example-cluster-head
            ports:
                - name: client
                  protocol: TCP
                  port: 10001
                  targetPort: 10001
                - name: dashboard
                  protocol: TCP
                  port: 8265
                  targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  worker_node:
    # Minimum number of Ray workers of this Pod type.
    min_workers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
    max_workers: 0
    # User-specified custom resources for use by Ray. Object with string keys and integer values.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    resources: {"example-resource-a": 1, "example-resource-b": 2}
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        labels:
          parent: skypilot
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-worker-
      spec:
        restartPolicy: Never
        volumes:
        - name: secret-volume
          secret:
            secretName: ssh-key-secret
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Never
          image: skypilot:latest
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; sleep infinity & wait;"]
          lifecycle:
            postStart:
              exec:
                command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys && sudo service ssh restart"]
          ports:
          - containerPort: 22  # Used for SSH
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which cause slowdowns if is not a shared memory volume.
          volumeMounts:
          - name: secret-volume
            readOnly: true
            mountPath: "/etc/secret-volume"
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: 1000m
              memory: 1024Mi
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              memory: 1024Mi
  head_node:
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
            parent: skypilot
            component: example-cluster-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: autoscaler

        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
        - name: secret-volume
          secret:
            secretName: ssh-key-secret
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Never
          image: skypilot:latest
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
          ports:
          - containerPort: 22  # Used for SSH
          - containerPort: 6379  # Redis port
          - containerPort: 10001  # Used by Ray Client
          - containerPort: 8265  # Used by Ray Dashboard

          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which cause slowdowns if is not a shared memory volume.
          volumeMounts:
          - name: secret-volume
            readOnly: true
            mountPath: "/etc/secret-volume"
          - mountPath: /dev/shm
            name: dshm
          lifecycle:
            postStart:
              exec:
                command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys && sudo service ssh restart"]
          resources:
            requests:
              cpu: 1000m
              memory: 1024Mi
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              memory: 1024Mi

# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0 --object-store-memory 78643201

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-store-memory 78643201


auth:
  ssh_user: sky
  ssh_private_key: ~/.ssh/sky-key

# These fields are required for external cloud providers.
setup_commands: []
head_setup_commands: []
worker_setup_commands: []
cluster_synced_files: []
file_mounts_sync_continuously: False
file_mounts: {}
initialization_commands: []
